{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/v-leyzhang/conda/fairseq/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import librosa\n",
    "import torch\n",
    "from torch import nn\n",
    "import torch.nn.functional as F\n",
    "from torchaudio import load\n",
    "import torchaudio\n",
    "from scipy.io.wavfile import read, write\n",
    "import torchaudio\n",
    "from librosa.util import normalize\n",
    "from librosa.filters import mel as librosa_mel_fn\n",
    "import numpy as np\n",
    "import librosa\n",
    "import argparse\n",
    "import librosa.display\n",
    "from tqdm import tqdm\n",
    "import os\n",
    "import soundfile as sf\n",
    "import matplotlib.pyplot as plt\n",
    "import parser\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:fairseq.tasks.text_to_speech:Please install tensorboardX: pip install tensorboardX\n",
      "INFO:fairseq.tasks.hubert_pretraining:current directory is /home/v-leyzhang\n",
      "INFO:fairseq.tasks.hubert_pretraining:HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/ntuanh/datasets/Fisher/fisher-vad-10s-separated/manifest', 'fine_tuning': False, 'labels': ['km'], 'label_dir': '/checkpoint/ntuanh/experiments/dialogue-LM/hubert/fisher-vad-10s-separated/kmeans/hubert_iter2/km500', 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}\n",
      "INFO:fairseq.models.hubert.hubert:HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.05, 'dropout_input': 0.1, 'dropout_features': 0.1, 'final_dim': 256, 'untie_final_proj': True, 'layer_norm_first': False, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'logit_temp': 0.1, 'target_glu': False, 'feature_grad_mult': 0.1, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': static, 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': static, 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'conv_pos': 128, 'conv_pos_groups': 16, 'conv_pos_batch_norm': False, 'latent_temp': [2.0, 0.5, 0.999995], 'skip_masked': False, 'skip_nomask': False, 'checkpoint_activations': False, 'required_seq_len_multiple': 2, 'depthwise_conv_kernel_size': 31, 'attn_type': '', 'pos_enc_type': 'abs', 'fp16': False}\n",
      "/data/v-leyzhang/conda/fairseq/lib/python3.8/site-packages/sklearn/base.py:348: InconsistentVersionWarning: Trying to unpickle estimator MiniBatchKMeans from version 0.24.2 when using version 1.3.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n",
      "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from examples.textless_nlp.dgslm.dgslm_utils import HubertTokenizer\n",
    "encoder = HubertTokenizer(\n",
    "    hubert_path = \"/data/v-leyzhang/dGSLM/hubert_fisher.pt\",\n",
    "    hubert_layer = 12,\n",
    "    km_path = \"/data/v-leyzhang/dGSLM/hubert_fisher_km_500.bin\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "MAX_WAV_VALUE = 32768.0\n",
    "\n",
    "def load_wav(full_path):\n",
    "    sampling_rate, data = read(full_path)\n",
    "    return data, sampling_rate\n",
    "\n",
    "def dynamic_range_compression(x, C=1, clip_val=1e-5):\n",
    "    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)\n",
    "\n",
    "def dynamic_range_decompression(x, C=1):\n",
    "    return np.exp(x) / C\n",
    "\n",
    "def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):\n",
    "    return torch.log(torch.clamp(x, min=clip_val) * C)\n",
    "\n",
    "def dynamic_range_decompression_torch(x, C=1):\n",
    "    return torch.exp(x) / C\n",
    "\n",
    "def spectral_normalize_torch(magnitudes):\n",
    "    output = dynamic_range_compression_torch(magnitudes)\n",
    "    return output\n",
    "\n",
    "def spectral_de_normalize_torch(magnitudes):\n",
    "    output = dynamic_range_decompression_torch(magnitudes)\n",
    "    return output\n",
    "\n",
    "mel_basis = {}\n",
    "hann_window = {}\n",
    "\n",
    "def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):\n",
    "    if torch.min(y) < -1.:\n",
    "        print('min value is ', torch.min(y))\n",
    "    if torch.max(y) > 1.:\n",
    "        print('max value is ', torch.max(y))\n",
    "\n",
    "    global mel_basis, hann_window\n",
    "    if fmax not in mel_basis:\n",
    "        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)\n",
    "        mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)\n",
    "        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)\n",
    "\n",
    "    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')\n",
    "    y = y.squeeze(1)\n",
    "\n",
    "    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],\n",
    "                      center=center, pad_mode='reflect', normalized=False, onesided=True,return_complex=False)\n",
    "\n",
    "    spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))\n",
    "\n",
    "    spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec)\n",
    "    spec = spectral_normalize_torch(spec)\n",
    "\n",
    "    return spec\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "ename": "RuntimeError",
     "evalue": "Failed to open the input \"/data/v-leyzhang/dGSLM/val_only_laughter/11298-A/fe_03_11298-A_045.wav\" (No such file or directory).",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[1;32m/home/v-leyzhang/fairseq/get_fisher_semantic_tokens.ipynb 单元格 4\u001b[0m line \u001b[0;36m2\n\u001b[1;32m      <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22535043484750553031227d/home/v-leyzhang/fairseq/get_fisher_semantic_tokens.ipynb#W3sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m path \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m/data/v-leyzhang/dGSLM/val_only_laughter/11298-A/fe_03_11298-A_045.wav\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22535043484750553031227d/home/v-leyzhang/fairseq/get_fisher_semantic_tokens.ipynb#W3sdnNjb2RlLXJlbW90ZQ%3D%3D?line=1'>2</a>\u001b[0m wav, _ \u001b[39m=\u001b[39m load(path)\n\u001b[1;32m      <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22535043484750553031227d/home/v-leyzhang/fairseq/get_fisher_semantic_tokens.ipynb#W3sdnNjb2RlLXJlbW90ZQ%3D%3D?line=2'>3</a>\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mwav\u001b[39m\u001b[39m\"\u001b[39m, wav\u001b[39m.\u001b[39mshape)\n\u001b[1;32m      <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22535043484750553031227d/home/v-leyzhang/fairseq/get_fisher_semantic_tokens.ipynb#W3sdnNjb2RlLXJlbW90ZQ%3D%3D?line=3'>4</a>\u001b[0m mel \u001b[39m=\u001b[39m mel_spectrogram(wav, n_fft\u001b[39m=\u001b[39m\u001b[39m480\u001b[39m, num_mels\u001b[39m=\u001b[39m\u001b[39m80\u001b[39m, sampling_rate\u001b[39m=\u001b[39m_,\n\u001b[1;32m      <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22535043484750553031227d/home/v-leyzhang/fairseq/get_fisher_semantic_tokens.ipynb#W3sdnNjb2RlLXJlbW90ZQ%3D%3D?line=4'>5</a>\u001b[0m                         hop_size\u001b[39m=\u001b[39m\u001b[39m160\u001b[39m, win_size\u001b[39m=\u001b[39m\u001b[39m480\u001b[39m, fmin\u001b[39m=\u001b[39m\u001b[39m0\u001b[39m, fmax\u001b[39m=\u001b[39m\u001b[39m4000\u001b[39m)\n",
      "File \u001b[0;32m/data/v-leyzhang/conda/fairseq/lib/python3.8/site-packages/torchaudio/backend/sox_io_backend.py:227\u001b[0m, in \u001b[0;36mload\u001b[0;34m(filepath, frame_offset, num_frames, normalize, channels_first, format)\u001b[0m\n\u001b[1;32m    225\u001b[0m \u001b[39mif\u001b[39;00m ret \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m    226\u001b[0m     \u001b[39mreturn\u001b[39;00m ret\n\u001b[0;32m--> 227\u001b[0m \u001b[39mreturn\u001b[39;00m _fallback_load(filepath, frame_offset, num_frames, normalize, channels_first, \u001b[39mformat\u001b[39;49m)\n",
      "File \u001b[0;32m/data/v-leyzhang/conda/fairseq/lib/python3.8/site-packages/torchaudio/io/_compat.py:102\u001b[0m, in \u001b[0;36mload_audio\u001b[0;34m(src, frame_offset, num_frames, convert, channels_first, format)\u001b[0m\n\u001b[1;32m     94\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mload_audio\u001b[39m(\n\u001b[1;32m     95\u001b[0m     src: \u001b[39mstr\u001b[39m,\n\u001b[1;32m     96\u001b[0m     frame_offset: \u001b[39mint\u001b[39m \u001b[39m=\u001b[39m \u001b[39m0\u001b[39m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    100\u001b[0m     \u001b[39mformat\u001b[39m: Optional[\u001b[39mstr\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[1;32m    101\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Tuple[torch\u001b[39m.\u001b[39mTensor, \u001b[39mint\u001b[39m]:\n\u001b[0;32m--> 102\u001b[0m     s \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39;49mclasses\u001b[39m.\u001b[39;49mtorchaudio\u001b[39m.\u001b[39;49mffmpeg_StreamReader(src, \u001b[39mformat\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m)\n\u001b[1;32m    103\u001b[0m     \u001b[39mreturn\u001b[39;00m _load_audio(s, frame_offset, num_frames, convert, channels_first)\n",
      "\u001b[0;31mRuntimeError\u001b[0m: Failed to open the input \"/data/v-leyzhang/dGSLM/val_only_laughter/11298-A/fe_03_11298-A_045.wav\" (No such file or directory)."
     ]
    }
   ],
   "source": [
    "path = \"/data/v-leyzhang/dGSLM/val_only_laughter/11298-A/fe_03_11298-A_045.wav\"\n",
    "wav, _ = load(path)\n",
    "print(\"wav\", wav.shape)\n",
    "mel = mel_spectrogram(wav, n_fft=480, num_mels=80, sampling_rate=_,\n",
    "                        hop_size=160, win_size=480, fmin=0, fmax=4000)\n",
    "\n",
    "print(\"mel\",mel.shape)\n",
    "plt.imshow(mel.squeeze().cpu().numpy(), origin='lower')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "file_list = [\"777_126732_000076_000008_8k_16k.wav\",\"LJ001-0001_8k_16k.wav\"]\n",
    "for filename in file_list:\n",
    "    path_16k = \"/home/v-leyzhang/demo/conversation_TTS/20231126-conversation-demo/\"\n",
    "    codes = encoder.wav2code(os.path.join(path_16k,filename),1)\n",
    "    codes = codes.split(\" \")\n",
    "    hubert_codes = np.array(codes)\n",
    "    np.save(os.path.join(\"/home/v-leyzhang/demo/conversation_TTS/20231126-conversation-demo\", filename.replace(\".wav\",\"_hubert_code\")), hubert_codes)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "checkpoint_file\n",
    "config_file = os.path.join(os.path.split(a.checkpoint_file)[0], 'config.json')\n",
    "with open(config_file) as f:\n",
    "    data = f.read()\n",
    "\n",
    "global h\n",
    "json_config = json.loads(data)\n",
    "h = AttrDict(json_config)\n",
    "\n",
    "torch.manual_seed(h.seed)\n",
    "global device\n",
    "if torch.cuda.is_available():\n",
    "    torch.cuda.manual_seed(h.seed)\n",
    "    device = torch.device('cuda')\n",
    "else:\n",
    "    device = torch.device('cpu')\n",
    "\n",
    "inference(a)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "fairseq",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
